kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py CHANGED
@@ -1,11 +1,15 @@
+from importlib.metadata import version
+
+from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
 from kreuzberg._gmft import GMFTConfig
+from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 
 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
+from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -18,21 +22,24 @@ from .extraction import (
     extract_file_sync,
 )
 
-__version__ = "3.2.0"
+__version__ = version("kreuzberg")
 
 __all__ = [
     "EasyOCRConfig",
+    "Entity",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
     "KreuzbergError",
+    "LanguageDetectionConfig",
     "Metadata",
     "MissingDependencyError",
     "OCRError",
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
     "ValidationError",
kreuzberg/_api/__init__.py ADDED (empty file, no content changes)
kreuzberg/_api/main.py ADDED
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from json import dumps
+from typing import TYPE_CHECKING, Annotated, Any
+
+from kreuzberg import (
+    ExtractionResult,
+    KreuzbergError,
+    MissingDependencyError,
+    ParsingError,
+    ValidationError,
+    batch_extract_bytes,
+)
+
+if TYPE_CHECKING:
+    from litestar.datastructures import UploadFile
+
+try:
+    from litestar import Litestar, Request, Response, get, post
+    from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
+    from litestar.enums import RequestEncodingType
+    from litestar.logging import StructLoggingConfig
+    from litestar.params import Body
+    from litestar.status_codes import (
+        HTTP_400_BAD_REQUEST,
+        HTTP_422_UNPROCESSABLE_ENTITY,
+        HTTP_500_INTERNAL_SERVER_ERROR,
+    )
+except ImportError as e:
+    raise MissingDependencyError.create_for_package(
+        dependency_group="litestar",
+        functionality="Litestar API and docker container",
+        package_name="litestar",
+    ) from e
+
+
+def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
+    if isinstance(exception, ValidationError):
+        status_code = HTTP_400_BAD_REQUEST
+    elif isinstance(exception, ParsingError):
+        status_code = HTTP_422_UNPROCESSABLE_ENTITY
+    else:
+        status_code = HTTP_500_INTERNAL_SERVER_ERROR
+
+    message = str(exception)
+    details = dumps(exception.context)
+
+    if request.app.logger:
+        request.app.logger.error(
+            "API error",
+            method=request.method,
+            url=str(request.url),
+            status_code=status_code,
+            message=message,
+            context=exception.context,
+        )
+
+    return Response(
+        content={"message": message, "details": details},
+        status_code=status_code,
+    )
+
+
+@post("/extract", operation_id="ExtractFiles")
+async def handle_files_upload(
+    data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
+) -> list[ExtractionResult]:
+    """Extract text content from the uploaded files."""
+    return await batch_extract_bytes(
+        [(await file.read(), file.content_type) for file in data],
+    )
+
+
+@get("/health", operation_id="HealthCheck")
+async def health_check() -> dict[str, str]:
+    """A simple health check endpoint."""
+    return {"status": "ok"}
+
+
+app = Litestar(
+    route_handlers=[handle_files_upload, health_check],
+    plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
+    logging_config=StructLoggingConfig(),
+    exception_handlers={
+        KreuzbergError: exception_handler,
+    },
+)
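
Usage sketch for the new endpoints (not part of the diff): assumes the app is served locally, e.g. with `litestar --app kreuzberg._api.main:app run`, and that httpx is installed. The multipart field name "data" matches the handler parameter above.

import httpx

# POST one or more files to /extract; the handler batch-extracts them.
with open("report.pdf", "rb") as f:
    response = httpx.post(
        "http://127.0.0.1:8000/extract",
        files=[("data", ("report.pdf", f, "application/pdf"))],
    )
response.raise_for_status()
print(response.json())  # list of serialized ExtractionResult objects

# Liveness probe.
print(httpx.get("http://127.0.0.1:8000/health").json())  # {"status": "ok"}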
kreuzberg/_entity_extraction.py ADDED
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+import os
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg._types import Entity
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class SpacyEntityExtractionConfig:
+    """Configuration for spaCy-based entity extraction."""
+
+    model_cache_dir: str | Path | None = None
+    """Directory to cache spaCy models. If None, uses spaCy's default."""
+
+    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
+    """Mapping of language codes to spaCy model names.
+
+    If None, uses default mappings:
+    - en: en_core_web_sm
+    - de: de_core_news_sm
+    - fr: fr_core_news_sm
+    - es: es_core_news_sm
+    - pt: pt_core_news_sm
+    - it: it_core_news_sm
+    - nl: nl_core_news_sm
+    - zh: zh_core_web_sm
+    - ja: ja_core_news_sm
+    """
+
+    fallback_to_multilingual: bool = True
+    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
+
+    max_doc_length: int = 1000000
+    """Maximum document length for spaCy processing."""
+
+    batch_size: int = 1000
+    """Batch size for processing multiple texts."""
+
+    def __post_init__(self) -> None:
+        if self.language_models is None:
+            object.__setattr__(self, "language_models", self._get_default_language_models())
+
+        if isinstance(self.language_models, dict):
+            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
+
+    @staticmethod
+    def _get_default_language_models() -> dict[str, str]:
+        """Get default language model mappings based on available spaCy models."""
+        return {
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+            "es": "es_core_news_sm",
+            "pt": "pt_core_news_sm",
+            "it": "it_core_news_sm",
+            "nl": "nl_core_news_sm",
+            "zh": "zh_core_web_sm",
+            "ja": "ja_core_news_sm",
+            "ko": "ko_core_news_sm",
+            "ru": "ru_core_news_sm",
+            "pl": "pl_core_news_sm",
+            "ro": "ro_core_news_sm",
+            "el": "el_core_news_sm",
+            "da": "da_core_news_sm",
+            "fi": "fi_core_news_sm",
+            "nb": "nb_core_news_sm",
+            "sv": "sv_core_news_sm",
+            "ca": "ca_core_news_sm",
+            "hr": "hr_core_news_sm",
+            "lt": "lt_core_news_sm",
+            "mk": "mk_core_news_sm",
+            "sl": "sl_core_news_sm",
+            "uk": "uk_core_news_sm",
+        }
+
+    def get_model_for_language(self, language_code: str) -> str | None:
+        """Get the appropriate spaCy model for a language code."""
+        if not self.language_models:
+            return None
+
+        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
+
+        if language_code in models_dict:
+            return models_dict[language_code]
+
+        base_lang = language_code.split("-")[0].lower()
+        if base_lang in models_dict:
+            return models_dict[base_lang]
+
+        return None
+
+    def get_fallback_model(self) -> str | None:
+        """Get fallback multilingual model if enabled."""
+        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
+
+
+def extract_entities(
+    text: str,
+    entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
+    custom_patterns: frozenset[tuple[str, str]] | None = None,
+    languages: list[str] | None = None,
+    spacy_config: SpacyEntityExtractionConfig | None = None,
+) -> list[Entity]:
+    """Extract entities from text using custom regex patterns and/or a NER model.
+
+    Args:
+        text: The input text to extract entities from.
+        entity_types: List of entity types to extract using the NER model.
+        custom_patterns: Frozenset of (entity type, regex pattern) pairs for custom extraction.
+        languages: List of detected languages to choose appropriate spaCy models.
+        spacy_config: Configuration for spaCy entity extraction.
+
+    Returns:
+        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
+
+    Raises:
+        MissingDependencyError: If `spacy` is not installed.
+    """
+    entities: list[Entity] = []
+    if custom_patterns:
+        custom_patterns_dict = dict(custom_patterns)
+        for ent_type, pattern in custom_patterns_dict.items():
+            entities.extend(
+                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
+                for match in re.finditer(pattern, text)
+            )
+
+    if spacy_config is None:
+        spacy_config = SpacyEntityExtractionConfig()
+
+    try:
+        import spacy  # noqa: F401
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="spacy",
+            dependency_group="entity-extraction",
+            functionality="Entity Extraction",
+        ) from e
+
+    model_name = _select_spacy_model(languages, spacy_config)
+    if not model_name:
+        return entities
+
+    nlp = _load_spacy_model(model_name, spacy_config)
+    if not nlp:
+        return entities
+
+    if len(text) > spacy_config.max_doc_length:
+        text = text[: spacy_config.max_doc_length]
+
+    doc = nlp(text)
+
+    entity_type_mapping = {etype.upper() for etype in entity_types}
+
+    entities.extend(
+        Entity(
+            type=ent.label_,
+            text=ent.text,
+            start=ent.start_char,
+            end=ent.end_char,
+        )
+        for ent in doc.ents
+        if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
+    )
+
+    return entities
+
+
+@lru_cache(maxsize=32)
+def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
+    """Load a spaCy model with caching."""
+    try:
+        import spacy
+
+        if spacy_config.model_cache_dir:
+            os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
+
+        nlp = spacy.load(model_name)
+
+        nlp.max_length = spacy_config.max_doc_length
+
+        return nlp
+    except (OSError, ImportError):
+        return None
+
+
+def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
+    """Select the best spaCy model based on detected languages."""
+    if not languages:
+        return spacy_config.get_model_for_language("en")
+
+    for lang in languages:
+        model_name = spacy_config.get_model_for_language(lang)
+        if model_name:
+            return model_name
+
+    return spacy_config.get_fallback_model()
+
+
+def extract_keywords(
+    text: str,
+    keyword_count: int = 10,
+) -> list[tuple[str, float]]:
+    """Extract keywords from text using the KeyBERT model.
+
+    Args:
+        text: The input text to extract keywords from.
+        keyword_count: Number of top keywords to return. Defaults to 10.
+
+    Returns:
+        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
+
+    Raises:
+        MissingDependencyError: If `keybert` is not installed.
+    """
+    try:
+        from keybert import KeyBERT
+
+        kw_model = KeyBERT()
+        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
+        return [(kw, float(score)) for kw, score in keywords]
+    except (RuntimeError, OSError, ValueError):
+        return []
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="keybert",
+            dependency_group="entity-extraction",
+            functionality="Keyword Extraction",
+        ) from e
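
A short sketch of both extraction paths defined above (requires spacy plus a model such as en_core_web_sm; the INVOICE_ID pattern is an illustrative assumption, and attribute-style access on Entity assumes its fields are exposed as attributes):

from kreuzberg._entity_extraction import SpacyEntityExtractionConfig, extract_entities

text = "Invoice INV-204881 was approved by Grace Hopper in New York."
entities = extract_entities(
    text,
    custom_patterns=frozenset({("INVOICE_ID", r"INV-\d{6}")}),  # regex path
    languages=["en"],  # resolves to en_core_web_sm via get_model_for_language
    spacy_config=SpacyEntityExtractionConfig(),
)
for entity in entities:
    print(entity.type, entity.text, entity.start, entity.end)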
kreuzberg/_extractors/_base.py CHANGED
@@ -3,10 +3,12 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, ClassVar
 
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from kreuzberg import ExtractionResult
     from kreuzberg._types import ExtractionConfig
 
 
@@ -90,3 +92,39 @@ class Extractor(ABC):
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
+
+    def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
+        """Apply quality post-processing to extraction result if enabled.
+
+        Args:
+            result: The raw extraction result
+
+        Returns:
+            Enhanced extraction result with quality improvements (if enabled)
+        """
+        # Only apply quality processing if enabled in config
+        if not self.config.enable_quality_processing:
+            return result
+
+        if not result.content:
+            return result
+
+        # Clean the content
+        cleaned_content = clean_extracted_text(result.content)
+
+        # Calculate quality score
+        quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
+
+        # Add quality metadata
+        enhanced_metadata = dict(result.metadata) if result.metadata else {}
+        enhanced_metadata["quality_score"] = quality_score
+
+        # Return enhanced result
+        return ExtractionResult(
+            content=cleaned_content,
+            mime_type=result.mime_type,
+            metadata=normalize_metadata(enhanced_metadata),
+            chunks=result.chunks,
+            detected_languages=result.detected_languages,
+            tables=result.tables,
+        )
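
The practical effect of _apply_quality_processing is a cleaned content string plus a "quality_score" entry in the result metadata. A hedged sketch of observing it end to end: only the enable_quality_processing flag is confirmed by this hunk; the config= keyword on extract_file_sync is an assumption.

from kreuzberg import ExtractionConfig, extract_file_sync

# Assumption: extract_file_sync accepts the extraction config as a keyword.
result = extract_file_sync("scan.png", config=ExtractionConfig(enable_quality_processing=True))
print(result.metadata.get("quality_score"))  # set by _apply_quality_processing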
kreuzberg/_extractors/_email.py ADDED
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import re
+from html import unescape
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+# Import optional dependencies at module level with proper error handling
+try:
+    import mailparse
+except ImportError:
+    mailparse = None
+
+try:
+    import html2text  # type: ignore[import-not-found]
+except ImportError:
+    html2text = None
+
+# Compile regex pattern once at module level
+_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+
+
+class EmailExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def _extract_email_headers(
+        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
+    ) -> None:
+        """Extract and process email headers."""
+        # Use single dict access where possible to avoid repeated lookups
+        subject = parsed_email.get("subject")
+        if subject:
+            metadata["subject"] = subject
+            text_parts.append(f"Subject: {subject}")
+
+        from_info = parsed_email.get("from")
+        if from_info:
+            from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
+            metadata["email_from"] = from_email
+            text_parts.append(f"From: {from_email}")
+
+        to_info = parsed_email.get("to")
+        if to_info:
+            if isinstance(to_info, list) and to_info:
+                to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+            elif isinstance(to_info, dict):
+                to_email = to_info.get("email", "")
+            else:
+                to_email = str(to_info)
+            metadata["email_to"] = to_email
+            text_parts.append(f"To: {to_email}")
+
+        date = parsed_email.get("date")
+        if date:
+            metadata["date"] = date
+            text_parts.append(f"Date: {date}")
+
+        cc = parsed_email.get("cc")
+        if cc:
+            metadata["email_cc"] = cc
+            text_parts.append(f"CC: {cc}")
+
+        bcc = parsed_email.get("bcc")
+        if bcc:
+            metadata["email_bcc"] = bcc
+            text_parts.append(f"BCC: {bcc}")
+
+    def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
+        """Extract and process email body content."""
+        text_content = parsed_email.get("text")
+        if text_content:
+            text_parts.append(f"\n{text_content}")
+            return  # If we have text, prefer it over HTML
+
+        html_content = parsed_email.get("html")
+        if html_content:
+            if html2text is not None:
+                # Use html2text if available (faster path)
+                h = html2text.HTML2Text()
+                h.ignore_links = True
+                h.ignore_images = True
+                converted_text = h.handle(html_content)
+                text_parts.append(f"\n{converted_text}")
+            else:
+                # Fallback: strip HTML tags and unescape entities
+                clean_html = _HTML_TAG_PATTERN.sub("", html_content)
+                clean_html = unescape(clean_html)
+                text_parts.append(f"\n{clean_html}")
+
+    def _extract_email_attachments(
+        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
+    ) -> None:
+        """Extract and process email attachments info."""
+        if parsed_email.get("attachments"):
+            attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
+            metadata["attachments"] = attachment_names
+            if attachment_names:
+                text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        if mailparse is None:
+            msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
+            raise MissingDependencyError(msg)
+
+        try:
+            parsed_email = mailparse.EmailDecode.load(content)
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            # Extract headers, body, and attachments
+            self._extract_email_headers(parsed_email, text_parts, metadata)
+            self._extract_email_body(parsed_email, text_parts)
+            self._extract_email_attachments(parsed_email, text_parts, metadata)
+
+            # Join efficiently
+            combined_text = "\n".join(text_parts)
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except Exception as e:
+            msg = f"Failed to parse email content: {e}"
+            raise RuntimeError(msg) from e
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
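
Driving the new extractor directly might look like the sketch below. The mime_type/config constructor signature is inherited from Extractor and is not shown in this diff, and "message/rfc822" is the conventional value of EML_MIME_TYPE; both are assumptions. mailparse must be installed.

from pathlib import Path

from kreuzberg import ExtractionConfig
from kreuzberg._extractors._email import EmailExtractor

extractor = EmailExtractor(mime_type="message/rfc822", config=ExtractionConfig())
result = extractor.extract_path_sync(Path("message.eml"))
print(result.content[:200])  # "Subject: ...", "From: ...", then the body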
kreuzberg/_extractors/_html.py CHANGED
@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractionResult
-from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
 if TYPE_CHECKING:
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        result = html_to_markdown.convert_to_markdown(safe_decode(content))
-        return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+        # Use html-to-markdown with script/nav removal for better quality
+        result = html_to_markdown.convert_to_markdown(
+            safe_decode(content),
+            preprocess_html=True,
+            preprocessing_preset="aggressive",
+            remove_navigation=True,
+            remove_forms=True,
+        )
+
+        # Skip normalize_spaces since quality processing will handle whitespace
+        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+
+        # Apply quality processing which includes normalization
+        return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = path.read_bytes()
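
The preprocessing flags can also be exercised against html-to-markdown directly; a sketch using exactly the keyword arguments passed in the hunk above (assumes an html-to-markdown version that supports these options):

import html_to_markdown

markdown = html_to_markdown.convert_to_markdown(
    "<nav>site menu</nav><p>Hello &amp; welcome</p><form>signup</form>",
    preprocess_html=True,
    preprocessing_preset="aggressive",
    remove_navigation=True,
    remove_forms=True,
)
print(markdown)  # navigation and form chrome stripped, entities unescaped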
kreuzberg/_extractors/_image.py CHANGED
@@ -1,5 +1,9 @@
 from __future__ import annotations
 
+import contextlib
+import os
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
 from anyio import Path as AsyncPath
@@ -7,6 +11,9 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -15,9 +22,6 @@ if TYPE_CHECKING:  # pragma: no cover
 
     from kreuzberg._types import ExtractionResult
 
-    import contextlib
-    from pathlib import Path
-
 
 class ImageExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -56,13 +60,11 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-        return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
+        result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
+        return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of extract_bytes."""
-        import os
-        import tempfile
-
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -80,23 +82,26 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-        from kreuzberg._ocr._tesseract import TesseractConfig
-        from kreuzberg._types import ExtractionResult
+        backend = get_ocr_backend(self.config.ocr_backend)
 
         if self.config.ocr_backend == "tesseract":
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
-
-            if isinstance(self.config.ocr_config, TesseractConfig):
-                config = self.config.ocr_config
-            else:
-                config = TesseractConfig()
-
-            results = process_batch_images_sync_pure([str(path)], config)
-            if results:
-                return results[0]
-            return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
-
-        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+            config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            result = backend.process_file_sync(path, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            result = backend.process_file_sync(path, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            result = backend.process_file_sync(path, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
         if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP: